Import Libraries¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import histogram
import plotly.io as pio
import plotly.express as px
import plotly.figure_factory as ff
from langdetect import detect
from sklearn.preprocessing import StandardScaler,OneHotEncoder,normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor  
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import RFECV
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier,VotingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from category_encoders.binary import BinaryEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import r2_score
from xgboost import XGBClassifier
import warnings 
# NOTE(review): this ignores every warning category for the rest of the
# notebook, so deprecation notices raised by later cells will not be shown.
warnings.filterwarnings("ignore")

%matplotlib inline
sns.set(rc={'figure.figsize': [10, 10]}, font_scale=1.3)

Read File and Load Data¶

In [2]:
 
# Location of the raw CSV file.
# NOTE(review): this is an absolute path on the author's machine; adjust it
# before running the notebook anywhere else.
RAW_DATA_CSV = "F:/Local Disk (D)/AI-python-EPSLION/Final Project-Reem Nabil-20231226/DataSet/heart_attack_prediction_dataset.csv"

# Load the dataset and preview its first five rows.
df = pd.read_csv(RAW_DATA_CSV)
df.head(5)
Out[2]:
Patient ID Age Sex Cholesterol Blood Pressure Heart Rate Diabetes Family History Smoking Obesity ... Sedentary Hours Per Day Income BMI Triglycerides Physical Activity Days Per Week Sleep Hours Per Day Country Continent Hemisphere Heart Attack Risk
0 BMW7812 67 Male 208 158/88 72 0 0 1 0 ... 6.615001 261404 31.251233 286 0 6 Argentina South America NaN 0
1 CZE1114 21 Male 389 165/93 98 1 1 1 1 ... 4.963459 285768 27.194973 235 1 7 Canada North America Northern Hemisphere 0
2 BNI9906 21 Female 324 174/99 72 1 0 0 0 ... 9.463426 235282 28.176571 587 4 4 France Europe Northern Hemisphere 0
3 JLN3497 84 Male 383 163/100 73 1 1 1 0 ... 7.648981 125640 36.464704 378 3 4 Canada North America Northern Hemisphere 0
4 GFO8847 66 Male 318 91/88 93 1 1 1 1 ... 1.514821 160555 21.809144 231 1 5 Thailand Asia Northern Hemisphere 0

5 rows × 26 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8754 non-null   float64
 12  Diet                             8763 non-null   object 
 13  Previous Heart Problems          8763 non-null   int64  
 14  Medication Use                   8763 non-null   int64  
 15  Stress Level                     8763 non-null   int64  
 16  Sedentary Hours Per Day          8763 non-null   float64
 17  Income                           8763 non-null   int64  
 18  BMI                              8763 non-null   float64
 19  Triglycerides                    8763 non-null   int64  
 20  Physical Activity Days Per Week  8763 non-null   int64  
 21  Sleep Hours Per Day              8763 non-null   int64  
 22  Country                          8763 non-null   object 
 23  Continent                        8763 non-null   object 
 24  Hemisphere                       8756 non-null   object 
 25  Heart Attack Risk                8763 non-null   int64  
dtypes: float64(3), int64(16), object(7)
memory usage: 1.7+ MB
In [4]:
"""
Summary statistics for every column: include='all' reports count/unique/top/freq
for the categorical columns and mean/std/min/quartiles/max for the numeric ones.

The most frequent value of "Sex" is "Male", i.e. male patients are the majority
of the dataset. This table alone does not show which sex is more susceptible to
heart attacks.
"""
df.describe(include = 'all')
Out[4]:
Patient ID Age Sex Cholesterol Blood Pressure Heart Rate Diabetes Family History Smoking Obesity ... Sedentary Hours Per Day Income BMI Triglycerides Physical Activity Days Per Week Sleep Hours Per Day Country Continent Hemisphere Heart Attack Risk
count 8763 8763.000000 8763 8763.000000 8763 8763.000000 8763.000000 8763.000000 8763.000000 8763.000000 ... 8763.000000 8763.000000 8763.000000 8763.000000 8763.000000 8763.000000 8763 8763 8756 8763.000000
unique 8763 NaN 2 NaN 3915 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 20 6 2 NaN
top BMW7812 NaN Male NaN 146/94 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN Germany Asia Northern Hemisphere NaN
freq 1 NaN 6111 NaN 8 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 477 2543 5656 NaN
mean NaN 53.707977 NaN 259.877211 NaN 75.021682 0.652288 0.492982 0.896839 0.501426 ... 5.993690 158263.181901 28.891446 417.677051 3.489672 7.023508 NaN NaN NaN 0.358211
std NaN 21.249509 NaN 80.863276 NaN 20.550948 0.476271 0.499979 0.304186 0.500026 ... 3.466359 80575.190806 6.319181 223.748137 2.282687 1.988473 NaN NaN NaN 0.479502
min NaN 18.000000 NaN 120.000000 NaN 40.000000 0.000000 0.000000 0.000000 0.000000 ... 0.001263 20062.000000 18.002337 30.000000 0.000000 4.000000 NaN NaN NaN 0.000000
25% NaN 35.000000 NaN 192.000000 NaN 57.000000 0.000000 0.000000 1.000000 0.000000 ... 2.998794 88310.000000 23.422985 225.500000 2.000000 5.000000 NaN NaN NaN 0.000000
50% NaN 54.000000 NaN 259.000000 NaN 75.000000 1.000000 0.000000 1.000000 1.000000 ... 5.933622 157866.000000 28.768999 417.000000 3.000000 7.000000 NaN NaN NaN 0.000000
75% NaN 72.000000 NaN 330.000000 NaN 93.000000 1.000000 1.000000 1.000000 1.000000 ... 9.019124 227749.000000 34.324594 612.000000 5.000000 9.000000 NaN NaN NaN 1.000000
max NaN 90.000000 NaN 400.000000 NaN 110.000000 1.000000 1.000000 1.000000 1.000000 ... 11.999313 299954.000000 39.997211 800.000000 7.000000 10.000000 NaN NaN NaN 1.000000

11 rows × 26 columns

In [5]:
# Check for duplicated rows (the sum counts the rows flagged as duplicates).
df.duplicated().sum()
Out[5]:
0

Missing Values¶

In [6]:
'''
missing data here in Exercise Hours Per Week & Hemisphere
'''
df.isna().sum()
Out[6]:
Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            9
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         7
Heart Attack Risk                  0
dtype: int64
In [7]:
df['Hemisphere'].isnull().sum()
Out[7]:
7
In [8]:
'''
Strategy for dealing with the missing values:
- Categorical data ==> fill with the mode (most frequent value).
- Numerical data   ==> fill with the mean or the median, depending on whether
  the column has outliers or not.

In this dataset most patients reside in the "Northern Hemisphere".
'''
df['Hemisphere'].mode()
Out[8]:
0    Northern Hemisphere
Name: Hemisphere, dtype: object

Fillna missing categorical data with Mode¶

In [9]:
# Fill the missing "Hemisphere" values with the most frequent (mode) value.
#
# The mode is taken per country first: the first row of the dataset is a patient
# from Argentina with a missing hemisphere, and the dataset-wide mode
# ("Northern Hemisphere") would mislabel it. The dataset-wide mode is kept as a
# fallback for a country that has no known hemisphere at all.
# NOTE(review): assumes every country carries a single hemisphere label - confirm.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Hemisphere']: that chained in-place call works
# on an intermediate object and does not update df under pandas copy-on-write.
overall_hemisphere_mode = df['Hemisphere'].mode()[0]
hemisphere_by_country = (
    df.dropna(subset=['Hemisphere'])
      .groupby('Country')['Hemisphere']
      .agg(lambda known: known.mode().iloc[0])
)
df['Hemisphere'] = (
    df['Hemisphere']
    .fillna(df['Country'].map(hemisphere_by_country))
    .fillna(overall_hemisphere_mode)
)
In [10]:
'''
after fill with mode ; there is no missing data.
'''
df['Hemisphere'].isnull().sum()
Out[10]:
0

Check for outliers in the numerical column with missing data using a box plot¶

In [11]:
  
fig = px.box(df, y='Exercise Hours Per Week')


fig.show()

Fillna Missing numerical data with Median¶

In [12]:
'''
1) from above boxplot ==> there is no outliers affetcted the data of  "Exercise Hours Per Week".
2) fillna with median.

'''
df['Exercise Hours Per Week'].median()
Out[12]:
10.070897344999999
In [13]:
'''
"Exercise Hours Per Week" can be filled with either the mean or the median,
because the box plot above shows no outliers that would distort them.

Here the missing values are filled with the median.
'''
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df['Exercise Hours Per Week']: that chained in-place call works on an
# intermediate object and does not update df under pandas copy-on-write.
exercise_median = df['Exercise Hours Per Week'].median()
df['Exercise Hours Per Week'] = df['Exercise Hours Per Week'].fillna(exercise_median)
In [14]:
df.isna().sum()
Out[14]:
Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: int64

Exploratory Data Analysis (EDA)¶

In [15]:
df.columns
Out[15]:
Index(['Patient ID', 'Age', 'Sex', 'Cholesterol', 'Blood Pressure',
       'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
       'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet',
       'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Country',
       'Continent', 'Hemisphere', 'Heart Attack Risk'],
      dtype='object')

Univariates¶

In [16]:
sns.histplot(df, x="Sex",stat='percent')
Out[16]:
<Axes: xlabel='Sex', ylabel='Percent'>
In [17]:
# Share of each "Diet" category.
# The labels are taken from the value_counts index so every wedge is named after
# the category it really represents; a hard-coded label list is only correct
# while the categories happen to be ordered Healthy > Average > Unhealthy by
# frequency.
diet_counts = df['Diet'].value_counts()
plt.pie(diet_counts, labels=diet_counts.index, startangle=90, autopct='%.3f',
        shadow=True)  # a real boolean instead of the string 'True'
Out[17]:
([<matplotlib.patches.Wedge at 0x23e3e0f5490>,
  <matplotlib.patches.Wedge at 0x23e3e0f7210>,
  <matplotlib.patches.Wedge at 0x23e3e109510>],
 [Text(-0.9602245610266066, 0.5366272378488263, 'Healthy'),
  Text(0.027207959994841456, -1.0996634607519336, 'Average'),
  Text(0.9466575216017192, 0.5602138313134464, 'Unhealthy')],
 [Text(-0.5237588514690581, 0.2927057660993598, '33.778'),
  Text(0.0148407054517317, -0.5998164331374183, '33.231'),
  Text(0.5163586481463923, 0.30557118071642525, '32.991')])
In [18]:
fig = px.histogram(df,x='Continent',histnorm='percent')
fig.show()
In [19]:
sns.histplot(df, x="Hemisphere",stat='percent')
Out[19]:
<Axes: xlabel='Hemisphere', ylabel='Percent'>
In [20]:
fig = px.histogram(df,x='Country',histnorm='percent')
fig.show()
In [21]:
categorical_columns = ['Sex', 'Diabetes', 'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption', 'Diet', 'Previous Heart Problems', 'Medication Use', 'Country', 'Continent', 'Hemisphere']

# Lay the pies out on a grid that is at most three columns wide.
# The row count uses ceiling division so that a column count that is not a
# multiple of three still gets enough axes (floor division would leave the last
# columns without a subplot and raise an IndexError).
num_cols = min(3, len(categorical_columns))
num_rows = -(-len(categorical_columns) // num_cols)

# squeeze=False always returns a 2-D array of axes, so flatten() also works for
# a single row or a single subplot.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5), squeeze=False)
fig.subplots_adjust(hspace=0.5)
axes = axes.flatten()

# One pie per categorical column, showing the share of each category.
for ax, column in zip(axes, categorical_columns):
    counts = df[column].value_counts()
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90, colors=sns.color_palette('pastel'))
    ax.set_title(column)

# Hide the grid cells that are left without a pie.
for ax in axes[len(categorical_columns):]:
    ax.set_visible(False)
plt.show()

Feature Engineer and Bivariate¶

In [22]:
'''
Split every "Blood Pressure" reading (text of the form "systolic/diastolic")
into two new columns. The parts are kept as strings here.
'''
bp_parts = [reading.split('/') for reading in df['Blood Pressure']]
df['Systolic'] = [parts[0] for parts in bp_parts]   # first number (high)
df['Diastolic'] = [parts[1] for parts in bp_parts]  # second number (low)
In [23]:
df_COU_BMI = df.groupby('Country')[['BMI']].mean().sort_values(by = "BMI")
df_COU_BMI
Out[23]:
BMI
Country
Japan 28.291157
France 28.555747
Vietnam 28.668896
Spain 28.712143
Colombia 28.714773
New Zealand 28.766462
Argentina 28.794898
Thailand 28.819812
China 28.861312
India 28.864451
Brazil 28.902022
United Kingdom 28.941320
Italy 29.004572
Nigeria 29.015281
Canada 29.052191
South Africa 29.075771
Australia 29.080740
Germany 29.098898
United States 29.165337
South Korea 29.458494
In [24]:
fig = px.bar(df_COU_BMI, y="BMI", title="Country-BMI Relation")
fig.show()
In [25]:
px.density_heatmap(df,x='Diet', y='Age')
In [26]:
sns.countplot(x ='Sex', hue = "Diabetes", data = df)
Out[26]:
<Axes: xlabel='Sex', ylabel='count'>
In [27]:
sns.countplot(x ='Sex', hue = "Obesity", data = df)
Out[27]:
<Axes: xlabel='Sex', ylabel='count'>
In [28]:
df.groupby('Continent')[['Heart Attack Risk']].mean().sort_values(by = "Heart Attack Risk")
Out[28]:
Heart Attack Risk
Continent
Europe 0.345828
Asia 0.353913
Australia 0.360860
South America 0.366373
Africa 0.368843
North America 0.376744
In [29]:
sns.violinplot(x = 'Heart Attack Risk', y = 'Continent', data = df)
Out[29]:
<Axes: xlabel='Heart Attack Risk', ylabel='Continent'>
In [30]:
df_COU_HEART = df.groupby('Country')[['Heart Attack Risk']].mean().sort_values(by = "Heart Attack Risk")
df_COU_HEART
Out[30]:
Heart Attack Risk
Country
India 0.313107
Italy 0.315545
Japan 0.332564
South Africa 0.338824
New Zealand 0.347126
Vietnam 0.348235
Spain 0.348837
United Kingdom 0.350109
France 0.352018
Brazil 0.352814
China 0.355505
Canada 0.359091
Germany 0.360587
Argentina 0.369427
Australia 0.374165
Thailand 0.376168
Colombia 0.377622
United States 0.395238
Nigeria 0.397321
South Korea 0.398533
In [31]:
fig = px.bar(df_COU_HEART, y="Heart Attack Risk", title="Country-Heart Attack Risk Relation")
fig.show()
In [32]:
px.density_heatmap(df,x="Stress Level", y="Income" ,title="Stress Level-Income Relation")
In [33]:
'''
==> Univariate analysis: 6 visualizations using "histplot", pie charts and "histogram".


==> Bivariate analysis: 8 visualizations using density heatmaps, violinplot, sns.countplot and px.bar.

univariate:
***********
From univariate diagrams the results as per below:
- About 70% of the patients in the dataset are male and about 30% are female.
- The "Diet" categories are almost evenly split: 33.778% "Healthy", 33.231% "Average" and 32.991% "Unhealthy".
- "Asia" is the most represented continent (29%), followed by "Europe" (25.57%); the least represented continents are "Africa" and then "North America".
- More than 60% of the patients are in the "Northern Hemisphere".
- The most represented countries in the dataset are "Germany" and "Nigeria", while the least represented country is "South Korea".
- 65.2% of the patients have diabetes.
- 49.3% of the patients have a family history (the mean of "Family History" is 0.493); 50.7% do not.
- 89.7% of the patients are smokers and 50.1% of the patients suffer from obesity.
- 59.8% of the patients consume alcohol.

Bivariate:
***********
From Bivariate diagrams the results as per below:
- The highest mean BMI belongs to "South Korea", "United States" and "Germany", while the lowest mean BMI belongs to "Japan", "France" and "Vietnam".
- Patients aged 20 to 24 mostly have an "Unhealthy" diet, patients aged 25 to 29 mostly have a "Healthy" diet, and patients aged 80 to 84 mostly have an "Unhealthy" diet.
- There are more males with diabetes than females with diabetes.
- There are more males with obesity than males without obesity.
- "North America" and "Africa" have the highest mean "Heart Attack Risk", while "Europe" and "Asia" have the lowest.
- The countries with the highest mean "Heart Attack Risk" are "South Korea" and "Nigeria", while the countries with the lowest are "India" and "Italy".

'''
Out[33]:
'\n==> In Univariate i use 6 visualization using "histplot" , "Pie chart" , "histogram".\n \n\n==> In Bivariate i use 8 visualization using heatmap , violinplot ,sns.countplot , violinplot , px.bar.\n\nunivariate:\n***********\nFrom univariate diagrams the results as per below:\n- 70 % from patients that mentioned in dataset were Males while 30 % from patients were females.\n- The more percentage regarding "DIET" feature that mentioned in dataset considering "33.778" as healthy then "33.231" as average while "32.991".\n- "Asia Continent" is existed with percentage 29% then "Europe" with percentage 25.57% and the less percenatge was for "Africa" then "North America".\n- "Northen Hemisphere" is existed with percentage more than 60%.\n- The more percenatge countries mentioned more in dataset was for "germany and Nigeria" while the less countries was belong to "south koria".\n- The diabetes patients with percentage 65.2%\n- 50.7% of patients have family history.\n- 89.7% of patients are smoker and also 50.1 of them suffering "obesity".\n- 59.8 % of patients consume alkhol.\n\nBivariate:\n***********\nFrom Bivariate diagrams the results as per below:\n- The most of mean of BMI belong to "South Korea" , "United States" , "Germany"  while the less BMI mean belongs to "Japan" , "France" , "Vietnam".\n- Age between "20" and "24" are unhealthy but more than 25 years till 29 years are healthy ;  and the ages between 80 and 84 unhealthy persons.\n- Males with diabetes is more than females with diabets.\n- Males with obesity more than males who are not obesity.\n- "North America Continent" and "Africa " are the most suffering from "heart attack risk"  while for "Europe" and "Asia" are less suffer from "heart attack risk".\n- The most average values of the countries who suffering from "heart attack risk" belong to "South Korea"  and "Nigeria"  while the less countries are "India" and "Italy".\n\n'

Machine Learning¶

In [34]:
df.columns
Out[34]:
Index(['Patient ID', 'Age', 'Sex', 'Cholesterol', 'Blood Pressure',
       'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
       'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet',
       'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Country',
       'Continent', 'Hemisphere', 'Heart Attack Risk', 'Systolic',
       'Diastolic'],
      dtype='object')
In [35]:
'''
Remove the patient identifier, the geographic columns and the raw
"Blood Pressure" text (it has already been split into "Systolic" and "Diastolic").
'''
unused_columns = ['Patient ID', 'Country', 'Continent', 'Hemisphere', 'Blood Pressure']
df.drop(columns=unused_columns, inplace=True)

convert categorical to numerical Values¶

In [36]:
'''
using Map Function in ordinal encodeing.
'''
ordinal = {'Unhealthy':0,'Average':1,'Healthy':2}
df['Diet'] = df['Diet'].map(ordinal)
In [37]:
df = pd.get_dummies(df, columns = ['Sex'])
In [38]:
df
Out[38]:
Age Cholesterol Heart Rate Diabetes Family History Smoking Obesity Alcohol Consumption Exercise Hours Per Week Diet ... Income BMI Triglycerides Physical Activity Days Per Week Sleep Hours Per Day Heart Attack Risk Systolic Diastolic Sex_Female Sex_Male
0 67 208 72 0 0 1 0 0 4.168189 1 ... 261404 31.251233 286 0 6 0 158 88 False True
1 21 389 98 1 1 1 1 1 1.813242 0 ... 285768 27.194973 235 1 7 0 165 93 False True
2 21 324 72 1 0 0 0 0 2.078353 2 ... 235282 28.176571 587 4 4 0 174 99 True False
3 84 383 73 1 1 1 0 1 9.828130 1 ... 125640 36.464704 378 3 4 0 163 100 False True
4 66 318 93 1 1 1 1 0 10.070897 0 ... 160555 21.809144 231 1 5 0 91 88 False True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8758 60 121 61 1 1 1 0 1 7.917342 2 ... 235420 19.655895 67 7 7 0 94 76 False True
8759 28 120 73 1 0 0 1 0 16.558426 2 ... 217881 23.993866 617 4 9 0 157 102 True False
8760 47 250 105 0 1 1 1 1 3.148438 1 ... 36998 35.406146 527 4 4 1 161 75 False True
8761 36 178 60 1 0 1 0 0 3.789950 0 ... 209943 27.294020 114 2 8 0 119 67 False True
8762 25 356 75 1 1 0 0 1 18.081748 2 ... 247338 32.914151 180 7 4 1 138 67 True False

8763 rows × 24 columns

In [39]:
df.head(1000)
Out[39]:
Age Cholesterol Heart Rate Diabetes Family History Smoking Obesity Alcohol Consumption Exercise Hours Per Week Diet ... Income BMI Triglycerides Physical Activity Days Per Week Sleep Hours Per Day Heart Attack Risk Systolic Diastolic Sex_Female Sex_Male
0 67 208 72 0 0 1 0 0 4.168189 1 ... 261404 31.251233 286 0 6 0 158 88 False True
1 21 389 98 1 1 1 1 1 1.813242 0 ... 285768 27.194973 235 1 7 0 165 93 False True
2 21 324 72 1 0 0 0 0 2.078353 2 ... 235282 28.176571 587 4 4 0 174 99 True False
3 84 383 73 1 1 1 0 1 9.828130 1 ... 125640 36.464704 378 3 4 0 163 100 False True
4 66 318 93 1 1 1 1 0 10.070897 0 ... 160555 21.809144 231 1 5 0 91 88 False True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 79 366 52 0 1 1 1 0 16.490857 2 ... 190818 27.286712 197 2 5 0 109 76 True False
996 43 164 81 1 1 1 0 0 1.348300 1 ... 84977 19.068251 133 7 6 1 172 85 False True
997 31 127 88 1 1 0 1 0 7.569312 2 ... 101438 36.474585 218 1 10 0 147 108 True False
998 70 128 54 0 0 1 0 1 15.869561 2 ... 252528 23.111596 394 2 8 0 118 69 True False
999 59 284 61 1 0 1 1 0 14.422932 0 ... 111980 21.998049 566 7 4 0 141 84 False True

1000 rows × 24 columns

Converting 'Object' and 'Boolean' Datatypes into Integer¶

In [40]:
df.dtypes
Out[40]:
Age                                  int64
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Heart Attack Risk                    int64
Systolic                            object
Diastolic                           object
Sex_Female                            bool
Sex_Male                              bool
dtype: object
In [41]:
# The two blood-pressure columns are still object (string) dtype and the two
# "Sex" dummy columns are bool (see df.dtypes above); cast all four to integers.
column = ['Systolic','Diastolic','Sex_Female','Sex_Male']
df[column] = df[column].astype(int)
In [42]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 24 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              8763 non-null   int64  
 1   Cholesterol                      8763 non-null   int64  
 2   Heart Rate                       8763 non-null   int64  
 3   Diabetes                         8763 non-null   int64  
 4   Family History                   8763 non-null   int64  
 5   Smoking                          8763 non-null   int64  
 6   Obesity                          8763 non-null   int64  
 7   Alcohol Consumption              8763 non-null   int64  
 8   Exercise Hours Per Week          8763 non-null   float64
 9   Diet                             8763 non-null   int64  
 10  Previous Heart Problems          8763 non-null   int64  
 11  Medication Use                   8763 non-null   int64  
 12  Stress Level                     8763 non-null   int64  
 13  Sedentary Hours Per Day          8763 non-null   float64
 14  Income                           8763 non-null   int64  
 15  BMI                              8763 non-null   float64
 16  Triglycerides                    8763 non-null   int64  
 17  Physical Activity Days Per Week  8763 non-null   int64  
 18  Sleep Hours Per Day              8763 non-null   int64  
 19  Heart Attack Risk                8763 non-null   int64  
 20  Systolic                         8763 non-null   int32  
 21  Diastolic                        8763 non-null   int32  
 22  Sex_Female                       8763 non-null   int32  
 23  Sex_Male                         8763 non-null   int32  
dtypes: float64(3), int32(4), int64(17)
memory usage: 1.5 MB

Feature Selection Using High Correlation Filter¶

In [43]:
'''
This technique to display the correlation coefficients for different variables.

this tool to  identify and visualize patterns in the given data

'''

corr = df.corr()
corr
Out[43]:
Age Cholesterol Heart Rate Diabetes Family History Smoking Obesity Alcohol Consumption Exercise Hours Per Week Diet ... Income BMI Triglycerides Physical Activity Days Per Week Sleep Hours Per Day Heart Attack Risk Systolic Diastolic Sex_Female Sex_Male
Age 1.000000 -0.009107 -0.003844 -0.014105 0.008353 0.394891 -0.008140 -0.006666 0.001341 -0.013230 ... -0.001733 -0.002612 0.003415 0.001384 -0.002185 0.006403 0.003070 -0.009826 -0.020067 0.020067
Cholesterol -0.009107 1.000000 0.000315 -0.013428 -0.021608 0.016342 -0.014843 -0.007261 0.021546 -0.010765 ... 0.000007 0.017292 -0.005454 0.016056 0.004456 0.019340 0.000133 0.002083 -0.007614 0.007614
Heart Rate -0.003844 0.000315 1.000000 0.006764 -0.013470 -0.012331 0.012725 0.003459 0.008778 -0.003014 ... 0.004873 0.005299 0.012244 0.000834 0.001811 -0.004251 0.008482 -0.018113 0.010921 -0.010921
Diabetes -0.014105 -0.013428 0.006764 1.000000 -0.013844 0.000527 0.012866 0.005551 -0.006608 0.006156 ... -0.000759 -0.002852 0.010431 -0.002411 -0.012457 0.017225 -0.005306 -0.000512 -0.003582 0.003582
Family History 0.008353 -0.021608 -0.013470 -0.013844 1.000000 0.011748 -0.001444 0.012701 -0.006486 -0.001401 ... -0.000401 -0.011492 -0.001904 0.009561 -0.011199 -0.001652 -0.009762 0.017818 -0.002180 0.002180
Smoking 0.394891 0.016342 -0.012331 0.000527 0.011748 1.000000 0.003969 0.012754 -0.000075 0.006023 ... 0.003096 0.007670 0.004650 -0.006465 -0.005424 -0.004051 -0.009534 -0.012293 -0.514837 0.514837
Obesity -0.008140 -0.014843 0.012725 0.012866 -0.001444 0.003969 1.000000 -0.024195 0.002106 0.003743 ... -0.003870 -0.006058 0.001467 0.005337 -0.005314 -0.013318 -0.001918 -0.020574 -0.002376 0.002376
Alcohol Consumption -0.006666 -0.007261 0.003459 0.005551 0.012701 0.012754 -0.024195 1.000000 -0.008654 0.005336 ... -0.022396 0.010562 0.006169 0.001593 -0.000843 -0.013778 0.010764 -0.007282 -0.002085 0.002085
Exercise Hours Per Week 0.001341 0.021546 0.008778 -0.006608 -0.006486 -0.000075 0.002106 -0.008654 1.000000 0.008124 ... -0.023092 0.004028 0.001237 0.007002 -0.001407 0.010524 -0.009280 -0.003447 0.006824 -0.006824
Diet -0.013230 -0.010765 -0.003014 0.006156 -0.001401 0.006023 0.003743 0.005336 0.008124 1.000000 ... -0.000659 0.011755 -0.013660 -0.013265 -0.014513 0.005908 0.013648 0.005636 -0.005740 0.005740
Previous Heart Problems 0.000868 -0.006070 -0.004956 0.000867 -0.004568 -0.000574 0.005159 0.010395 0.005627 0.019773 ... -0.003281 0.015718 -0.019029 0.008537 0.004460 0.000274 -0.011926 0.008813 -0.001964 0.001964
Medication Use 0.000980 -0.000905 0.009244 -0.002656 0.000981 -0.010877 -0.006267 0.003339 -0.007231 -0.019940 ... -0.003464 0.009514 -0.011095 -0.011139 -0.020393 0.002234 -0.001182 0.004607 0.007148 -0.007148
Stress Level 0.018307 -0.024487 -0.004547 0.006719 0.015637 -0.001757 0.010626 -0.005023 -0.009486 0.012093 ... -0.002760 -0.003250 -0.003921 0.007405 -0.014205 -0.004111 0.017848 -0.008445 0.021835 -0.021835
Sedentary Hours Per Day 0.017280 0.018914 -0.010232 0.004705 0.002561 0.015311 -0.001333 -0.012828 0.008188 0.002100 ... 0.003511 -0.000024 -0.005785 -0.006178 0.004792 -0.005613 0.003393 -0.006606 -0.002995 0.002995
Income -0.001733 0.000007 0.004873 -0.000759 -0.000401 0.003096 -0.003870 -0.022396 -0.023092 -0.000659 ... 1.000000 0.008836 0.010739 0.000130 -0.006598 0.009628 0.010414 0.008816 -0.002660 0.002660
BMI -0.002612 0.017292 0.005299 -0.002852 -0.011492 0.007670 -0.006058 0.010562 0.004028 0.011755 ... 0.008836 1.000000 -0.005964 0.008110 -0.010030 0.000020 0.004279 0.000806 0.003021 -0.003021
Triglycerides 0.003415 -0.005454 0.012244 0.010431 -0.001904 0.004650 0.001467 0.006169 0.001237 -0.013660 ... 0.010739 -0.005964 1.000000 -0.007556 -0.029216 0.010471 0.005121 0.000545 -0.002933 0.002933
Physical Activity Days Per Week 0.001384 0.016056 0.000834 -0.002411 0.009561 -0.006465 0.005337 0.001593 0.007002 -0.013265 ... 0.000130 0.008110 -0.007556 1.000000 0.014033 -0.005014 -0.007574 0.016294 0.007660 -0.007660
Sleep Hours Per Day -0.002185 0.004456 0.001811 -0.012457 -0.011199 -0.005424 -0.005314 -0.000843 -0.001407 -0.014513 ... -0.006598 -0.010030 -0.029216 0.014033 1.000000 -0.018528 -0.004628 0.010679 0.005329 -0.005329
Heart Attack Risk 0.006403 0.019340 -0.004251 0.017225 -0.001652 -0.004051 -0.013318 -0.013778 0.010524 0.005908 ... 0.009628 0.000020 0.010471 -0.005014 -0.018528 1.000000 0.018585 -0.007509 -0.003095 0.003095
Systolic 0.003070 0.000133 0.008482 -0.005306 -0.009762 -0.009534 -0.001918 0.010764 -0.009280 0.013648 ... 0.010414 0.004279 0.005121 -0.007574 -0.004628 0.018585 1.000000 0.013337 0.006037 -0.006037
Diastolic -0.009826 0.002083 -0.018113 -0.000512 0.017818 -0.012293 -0.020574 -0.007282 -0.003447 0.005636 ... 0.008816 0.000806 0.000545 0.016294 0.010679 -0.007509 0.013337 1.000000 0.002251 -0.002251
Sex_Female -0.020067 -0.007614 0.010921 -0.003582 -0.002180 -0.514837 -0.002376 -0.002085 0.006824 -0.005740 ... -0.002660 0.003021 -0.002933 0.007660 0.005329 -0.003095 0.006037 0.002251 1.000000 -1.000000
Sex_Male 0.020067 0.007614 -0.010921 0.003582 0.002180 0.514837 0.002376 0.002085 -0.006824 0.005740 ... 0.002660 -0.003021 0.002933 -0.007660 -0.005329 0.003095 -0.006037 -0.002251 -1.000000 1.000000

24 rows × 24 columns

In [44]:
plt.figure(figsize = (19,19))
sns.heatmap(df[['Age','Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity',
      'Cholesterol','Alcohol Consumption', 'Exercise Hours Per Week',
     'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day','Heart Attack Risk']].corr(), cmap="YlGnBu",
            annot=True)
Out[44]:
<Axes: >
In [45]:
'''
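From the High Correlation Filter it is observed that
heart attack prediction does not depend much on the features below:
1) Smoking
2) Diastolic
3) Obesity
4) Previous Heart Problems
5) Diabetes
6) Family History
7) Alcohol Consumption

8) Medication Use

Note: the correlation of every feature with "Heart Attack Risk" is below 0.02 in absolute value, so this filter barely separates the features from each other.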
'''
Out[45]:
'\nfrom High Correlation Filter\nfrom here its observed that:\nHeart attack prediction not much dependent on the below:\n1) smoking\n2) diastolic\n3) obesity\n4) previous heart problems\n5) diabetes\n6)family history\n7) Alcohol Consumption\n\n8) Medication level\n\n'

Feature Selection Using Random forest¶

In [46]:
'''
Target here is 'Heart Attack Risk' which is numeric.
'''
x_rf=df.drop('Heart Attack Risk',axis=1)
y_rf = df['Heart Attack Risk']
In [47]:
x_train_rf,x_test_rf,y_train_rf,y_test_rf=train_test_split(x_rf,y_rf,random_state=0,test_size=0.2)
In [48]:
rf = RandomForestClassifier(random_state=0)
In [49]:
rf.fit(x_train_rf,y_train_rf)
Out[49]:
RandomForestClassifier(random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=0)
In [50]:
# Pair every importance with the feature it belongs to.
# The names must come from the matrix the forest was fitted on: iterating over df
# yields its column names, which still include the target 'Heart Attack Risk',
# so zipping over df shifts the label of every feature after the target by one
# position and silently drops the last feature.
f_i = list(zip(x_train_rf.columns, rf.feature_importances_))
# Sort ascending so the most important feature ends up at the top of the chart.
f_i.sort(key=lambda  x:x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
plt.xlabel('Feature importance')
plt.title('Random forest feature importances')
plt.show()
In [51]:
'''
from Feature Selection Using Random forest:

from here its observed that:

Heart attack prediction does not depend much on the features below (the same features that were raised by the high correlation filter, plus "Sex"):

1) smoking
2) diastolic
3) obesity
4) previous heart problems
5) diabetes
6)family history
7) Alcohol Consumption

8) Medication level

9)Sex

so i will drop them

'''
Out[51]:
'\nfrom Feature Selection Using Random forest:\n\nfrom here its observed that:\n\nHeart attack prediction not much dependent on the below: => such as the features that raised in high corelation filter.\n\n1) smoking\n2) diastolic\n3) obesity\n4) previous heart problems\n5) diabetes\n6)family history\n7) Alcohol Consumption\n\n8) Medication level\n\n9)Sex\n\nso i will drop them\n\n'
In [52]:
df.columns
Out[52]:
Index(['Age', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History',
       'Smoking', 'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week',
       'Diet', 'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day',
       'Heart Attack Risk', 'Systolic', 'Diastolic', 'Sex_Female', 'Sex_Male'],
      dtype='object')
In [53]:
# Drop the features listed in the notes above as having little influence on the
# prediction; both "Sex" dummy columns are removed together.
low_importance_features = [
    'Smoking', 'Diastolic', 'Obesity', 'Previous Heart Problems', 'Diabetes',
    'Family History', 'Alcohol Consumption', 'Medication Use',
    'Sex_Female', 'Sex_Male',
]
df.drop(columns=low_importance_features, inplace=True)
In [54]:
df.head()
Out[54]:
Age Cholesterol Heart Rate Exercise Hours Per Week Diet Stress Level Sedentary Hours Per Day Income BMI Triglycerides Physical Activity Days Per Week Sleep Hours Per Day Heart Attack Risk Systolic
0 67 208 72 4.168189 1 9 6.615001 261404 31.251233 286 0 6 0 158
1 21 389 98 1.813242 0 1 4.963459 285768 27.194973 235 1 7 0 165
2 21 324 72 2.078353 2 9 9.463426 235282 28.176571 587 4 4 0 174
3 84 383 73 9.828130 1 9 7.648981 125640 36.464704 378 3 4 0 163
4 66 318 93 10.070897 0 6 1.514821 160555 21.809144 231 1 5 0 91
In [55]:
df.to_csv("processed_heart_attack.csv",index=False)
In [56]:
df.columns
Out[56]:
Index(['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Diet',
       'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI',
       'Triglycerides', 'Physical Activity Days Per Week',
       'Sleep Hours Per Day', 'Heart Attack Risk', 'Systolic'],
      dtype='object')